import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler,OneHotEncoder,normalize
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV
from sklearn.feature_selection import RFECV
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import BaggingClassifier,VotingClassifier,RandomForestClassifier,GradientBoostingClassifier
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import VotingRegressor
from category_encoders.binary import BinaryEncoder
from sklearn.preprocessing import RobustScaler, StandardScaler, PolynomialFeatures
from sklearn.metrics import accuracy_score,classification_report
from sklearn.metrics import r2_score
from xgboost import XGBClassifier
import warnings
warnings.filterwarnings("ignore")
# Load the preprocessed heart-attack dataset produced by the preprocessing step.
df= pd.read_csv("../data/processed/processed_heart_attack.csv")
# Preview the first five rows.
df.head()
| Age | Cholesterol | Heart Rate | Exercise Hours Per Week | Diet | Stress Level | Sedentary Hours Per Day | Income | BMI | Triglycerides | Physical Activity Days Per Week | Sleep Hours Per Day | Heart Attack Risk | Systolic | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 67 | 208 | 72 | 4.168189 | 1 | 9 | 6.615001 | 261404 | 31.251233 | 286 | 0 | 6 | 0 | 158 |
| 1 | 21 | 389 | 98 | 1.813242 | 0 | 1 | 4.963459 | 285768 | 27.194973 | 235 | 1 | 7 | 0 | 165 |
| 2 | 21 | 324 | 72 | 2.078353 | 2 | 9 | 9.463426 | 235282 | 28.176571 | 587 | 4 | 4 | 0 | 174 |
| 3 | 84 | 383 | 73 | 9.828130 | 1 | 9 | 7.648981 | 125640 | 36.464704 | 378 | 3 | 4 | 0 | 163 |
| 4 | 66 | 318 | 93 | 10.070897 | 0 | 6 | 1.514821 | 160555 | 21.809144 | 231 | 1 | 5 | 0 | 91 |
df.columns
Index(['Age', 'Cholesterol', 'Heart Rate', 'Exercise Hours Per Week', 'Diet',
'Stress Level', 'Sedentary Hours Per Day', 'Income', 'BMI',
'Triglycerides', 'Physical Activity Days Per Week',
'Sleep Hours Per Day', 'Heart Attack Risk', 'Systolic'],
dtype='object')
# Separate the predictors from the 'Heart Attack Risk' target column.
y = df['Heart Attack Risk']
x = df.drop('Heart Attack Risk', axis=1)

# Hold out 20% of the rows for testing (fixed seed for reproducibility).
x_train, x_test, y_train, y_test = train_test_split(
    x, y, random_state=0, test_size=0.2
)

# Standardize features: fit on the training split only, then apply the same
# transform to the test split to avoid data leakage.
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)
Hyperparameter tuning directly affects model performance.
==> Each model has its own hyperparameters, and each hyperparameter can take a number of different values, so the question is: which values of these parameters achieve the best performance and accuracy?
Hyperparameter tuning methods include GridSearchCV and Randomized Search.
1) GridSearchCV: exhaustively considers all parameter combinations.
GridSearchCV advantage: more accurate than Randomized Search, since grid search tries every possibility in the value range; the number of models trained depends on the number of possibilities.
GridSearchCV disadvantage: takes a lot of time and is computationally expensive.
The GridSearchCV instance implements the usual estimator API: when "fitting" it on a dataset, all possible combinations of parameter values are evaluated and the best combination is retained.
2) Randomized Search: tries random parameter values and trains models on a random subset of the parameter space.
Randomized Search advantage: reduces cost and time.
Randomized Search disadvantage: less accurate than GridSearchCV.
'''
Hyperparameter Tuning (GridSearchCV)
1) For SVC
'''
# Grid search over SVC hyperparameters, scored by recall.
# NOTE(review): random_state is not a tunable hyperparameter -- for SVC it
# only seeds probability-estimate shuffling -- so the original grid, which
# searched random_state over range(0, 10), trained 10x as many identical
# models. It is now fixed on the estimator instead.
model = SVC(random_state=0)
params = [
    {'C': [1, 10], 'kernel': ['linear', 'sigmoid', 'poly']},
    # Duplicate 0.01 removed from the original gamma list.
    {'C': [1, 10], 'kernel': ['rbf'], 'gamma': [0.5, 0.6, 0.7, 0.1, 0.01]},
]
grid_search_svc = GridSearchCV(estimator=model,
                               param_grid=params,
                               scoring='recall',
                               n_jobs=-1)
grid_search_svc.fit(x_train, y_train)
GridSearchCV(estimator=SVC(), n_jobs=-1,
param_grid=[{'C': [1, 10], 'kernel': ['linear', 'sigmoid', 'poly'],
'random_state': range(0, 10)},
{'C': [1, 10],
'gamma': [0.5, 0.6, 0.7, 0.1, 0.01, 0.01],
'kernel': ['rbf'], 'random_state': range(0, 10)}],
scoring='recall')In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. GridSearchCV(estimator=SVC(), n_jobs=-1,
param_grid=[{'C': [1, 10], 'kernel': ['linear', 'sigmoid', 'poly'],
'random_state': range(0, 10)},
{'C': [1, 10],
'gamma': [0.5, 0.6, 0.7, 0.1, 0.01, 0.01],
'kernel': ['rbf'], 'random_state': range(0, 10)}],
scoring='recall')SVC()
SVC()
'''
The best params found here are C=10, kernel='sigmoid' and random_state=0 (see grid_search_svc.best_params_ below).
'''
grid_search_svc.best_params_
{'C': 10, 'kernel': 'sigmoid', 'random_state': 0}
'''
here SVC score is 0.312
'''
grid_search_svc.best_score_
0.312
'''
Hyperparameter Tuning (GridSearchCV)
2) For KNN
'''
# Grid search over KNN hyperparameters, scored by recall.
# NOTE(review): 'haversine' was removed from the original metric list -- it
# is defined only for 2-D (latitude, longitude) data, so every such candidate
# errors out (scored as NaN) on this 13-feature matrix. Some remaining
# metric/algorithm pairs (e.g. cosine with kd_tree) are still invalid;
# GridSearchCV records NaN for those candidates and they can never win.
knn_classifer = KNeighborsClassifier()
params = [{'n_neighbors': [3, 5, 7, 9],
           'weights': ['uniform', 'distance'],
           'algorithm': ['ball_tree', 'kd_tree', 'brute'],
           'metric': ['cityblock', 'cosine', 'euclidean', 'l1', 'l2',
                      'manhattan', 'nan_euclidean', 'minkowski'],
           'leaf_size': [15, 40]}]
grid_search_knn = GridSearchCV(knn_classifer,
                               param_grid=params,
                               scoring='recall')
grid_search_knn.fit(x_train, y_train)
GridSearchCV(estimator=KNeighborsClassifier(),
param_grid=[{'algorithm': ['ball_tree', 'kd_tree', 'brute'],
'leaf_size': [15, 40],
'metric': ['cityblock', 'cosine', 'euclidean', 'l1',
'l2', 'haversine', 'manhattan',
'nan_euclidean', 'minkowski'],
'n_neighbors': [3, 5, 7, 9],
'weights': ['uniform', 'distance']}],
scoring='recall')In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. GridSearchCV(estimator=KNeighborsClassifier(),
param_grid=[{'algorithm': ['ball_tree', 'kd_tree', 'brute'],
'leaf_size': [15, 40],
'metric': ['cityblock', 'cosine', 'euclidean', 'l1',
'l2', 'haversine', 'manhattan',
'nan_euclidean', 'minkowski'],
'n_neighbors': [3, 5, 7, 9],
'weights': ['uniform', 'distance']}],
scoring='recall')KNeighborsClassifier()
KNeighborsClassifier()
grid_search_knn.best_params_
{'algorithm': 'brute',
'leaf_size': 15,
'metric': 'cosine',
'n_neighbors': 3,
'weights': 'distance'}
'''
here KNN score is 0.2988
'''
grid_search_knn.best_score_
0.2988
# Grid search over Random Forest hyperparameters, scored by accuracy.
# (RandomForestClassifier is already imported at the top of the file; the
# redundant mid-file re-import was dropped.)
grid = [{'n_estimators': [100, 300, 500], 'max_depth': [None, 5, 10, 15],
         'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4]}]
rfc = RandomForestClassifier()
grid_search = GridSearchCV(estimator=rfc, param_grid=grid,
                           scoring='accuracy', n_jobs=-1, verbose=2)
grid_search = grid_search.fit(x_train, y_train)
Fitting 5 folds for each of 108 candidates, totalling 540 fits
# NOTE(review): this recall-scored grid search is constructed but never
# fitted or used anywhere below -- dead code; either fit it and report its
# results, or remove it.
grid_search_rfc = GridSearchCV(rfc,
                               param_grid=grid,
                               scoring='recall')
grid_search.best_params_
{'max_depth': 5,
'min_samples_leaf': 1,
'min_samples_split': 2,
'n_estimators': 100}
grid_search.best_score_
0.6433666191155493
'''
The best score from hyperparameter tuning of the Random Forest model is 0.6433666191155493 (see grid_search.best_score_ above).
'''
'\nThe best score from Hyper parameter tunning of Random forest model which is 0.64365192582025698\n'
from sklearn.metrics import precision_recall_curve

# NOTE(review): y_proba was never defined anywhere in this file, so this cell
# raised a NameError. Derive the class probabilities on the test split from
# the tuned random-forest grid search fitted above; column 1 is the
# positive-class ('Heart Attack Risk' == 1) probability.
y_proba = grid_search.predict_proba(x_test)

# Precision and recall at every candidate decision threshold.
precisions, recalls, thresholds = precision_recall_curve(y_test, y_proba[:, 1])

pd.options.display.float_format = '{:,.10f}'.format
# precision_recall_curve returns len(thresholds) + 1 precision/recall values;
# drop the final element so the three columns align row-for-row.
df_Recall_per = pd.DataFrame({'Threshold': thresholds,
                              'Precision': precisions[:-1],
                              'Recall': recalls[:-1]})
df_Recall_per.head()
| Threshold | Precision | Recall | |
|---|---|---|---|
| 0 | 0.1200000000 | 0.3645179692 | 1.0000000000 |
| 1 | 0.1400000000 | 0.3641552511 | 0.9984350548 |
| 2 | 0.1500000000 | 0.3643632210 | 0.9984350548 |
| 3 | 0.1600000000 | 0.3645714286 | 0.9984350548 |
| 4 | 0.1700000000 | 0.3647798742 | 0.9984350548 |
import plotly.express as px
# Visualize how precision and recall each change as the decision threshold moves.
px.line(df_Recall_per, x='Threshold', y='Precision', title='Precision vs Threshold', width=800, height=600)
px.line(df_Recall_per, x='Threshold', y='Recall', title='Recall vs Threshold', width=800, height=600)
# Precision-recall trade-off curve, with the threshold shown on hover.
px.line(df_Recall_per, x='Recall', y='Precision', title='Precision-Recall Curve', width=800, height=600,hover_data=['Threshold'])
# Find Threshold with Recall >= 0.6
# Recall falls as the threshold rises, so the largest qualifying threshold
# yields the best precision while still meeting the recall floor.
meets_recall = df_Recall_per['Recall'] >= 0.6
threshold = df_Recall_per.loc[meets_recall, 'Threshold'].max()
chosen_row = df_Recall_per[df_Recall_per['Threshold'] == threshold]
print('Threshold: ', threshold)
print('Precision: ', chosen_row['Precision'].values[0])
print('Recall: ', chosen_row['Recall'].values[0])
Threshold: 0.35 Precision: 0.3717948717948718 Recall: 0.6353677621283255
from sklearn.metrics import roc_curve
# ROC points (false-positive rate vs true-positive rate) per threshold.
# NOTE(review): y_proba (predicted test-set probabilities) is not created
# anywhere earlier in this file -- confirm the cell that computes it was not
# lost; as written this line raises a NameError.
fpr, tpr, thresholds = roc_curve(y_test, y_proba[:, 1])
df_ROC = pd.DataFrame({'Threshold': thresholds, 'FPR': fpr, 'TPR': tpr})
# Preview the first ten ROC points.
df_ROC.head(10)
| Threshold | FPR | TPR | |
|---|---|---|---|
| 0 | inf | 0.0000000000 | 0.0000000000 |
| 1 | 0.6200000000 | 0.0008976661 | 0.0000000000 |
| 2 | 0.6100000000 | 0.0017953321 | 0.0031298905 |
| 3 | 0.5900000000 | 0.0026929982 | 0.0031298905 |
| 4 | 0.5700000000 | 0.0044883303 | 0.0093896714 |
| 5 | 0.5600000000 | 0.0071813285 | 0.0125195618 |
| 6 | 0.5500000000 | 0.0116696589 | 0.0140845070 |
| 7 | 0.5400000000 | 0.0134649910 | 0.0172143975 |
| 8 | 0.5300000000 | 0.0179533214 | 0.0187793427 |
| 9 | 0.5200000000 | 0.0260323160 | 0.0250391236 |
px.line(df_ROC, x='FPR', y='TPR', title='ROC Curve', width=800, height=600, hover_data=['Threshold'])
# Final candidate line-up: the tuned SVC and RF configurations found by the
# grid searches above, plus a KNN baseline, a bagged decision tree, and a
# hard-voting ensemble of three simple classifiers.
models = {
    'KNN': KNeighborsClassifier(n_neighbors=5),
    'SVC': SVC(C=10, kernel='sigmoid', random_state=0),
    'RF': RandomForestClassifier(n_estimators=100, min_samples_split=2,
                                 min_samples_leaf=1, max_depth=5),
    'Bagging_classifier': BaggingClassifier(DecisionTreeClassifier(),
                                            n_estimators=5, n_jobs=-1),
    'voting': VotingClassifier(estimators=[('LR', LogisticRegression()),
                                           ('NB', GaussianNB()),
                                           ('DT', DecisionTreeClassifier())]),
}
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report,recall_score,precision_score
import joblib

# Train each candidate model, report train/test metrics, and persist it.
for name, model in models.items():
    print('--------- ', name, '-------------')
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)
    print('accuracy_training: ', accuracy_score(y_train, model.predict(x_train)))
    # Fixed argument order: accuracy_score(y_true, y_pred). Accuracy is
    # symmetric so the value is unchanged, but this now matches every other
    # metric call below.
    print('accuracy_testing: ', accuracy_score(y_test, y_pred))
    print('confusion matrix: ', confusion_matrix(y_test, y_pred))
    print('recall score: ', recall_score(y_test, y_pred, average='weighted'))
    print('precision score: ', precision_score(y_test, y_pred, average='weighted'))
    # NOTE(review): these are joblib pickle files, not HDF5 -- the '.h5'
    # extension is misleading but kept so existing loaders keep working.
    joblib.dump(model, name + '_model.h5')
    print('-' * 30)
'''
The best classification result comes from the Random Forest model.
'''
--------- KNN ------------- accuracy_training: 0.7174037089871612 accuracy_testing: 0.5556189389617798 confusion matrix: [[824 290] [489 150]] recall score: 0.5556189389617798 precision score: 0.5230772331259191 ------------------------------ --------- SVC ------------- accuracy_training: 0.4864479315263909 accuracy_testing: 0.5464917284654878 confusion matrix: [[738 376] [419 220]] recall score: 0.5464917284654878 precision score: 0.539899972974961 ------------------------------ --------- RF ------------- accuracy_training: 0.6433666191155493 accuracy_testing: 0.6354820308043354 confusion matrix: [[1114 0] [ 639 0]] recall score: 0.6354820308043354 precision score: 0.40383741147520236 ------------------------------ --------- Bagging_classifier ------------- accuracy_training: 0.9504992867332382 accuracy_testing: 0.5681688533941814 confusion matrix: [[825 289] [468 171]] recall score: 0.5681688533941814 precision score: 0.5409755661476753 ------------------------------ --------- voting ------------- accuracy_training: 0.6433666191155493 accuracy_testing: 0.6354820308043354 confusion matrix: [[1114 0] [ 639 0]] recall score: 0.6354820308043354 precision score: 0.40383741147520236 ------------------------------
'\nthe better result from classification is Random forest model.\n'
'''
Run this command to list the features that will be used later in deployment.
'''
df.columns
Index(['Age', 'Cholesterol', 'Heart Rate', 'Exercise Hours Per Week', 'Diet',
'Stress Level', 'Sedentary Hours Per Day', 'Income', 'BMI',
'Triglycerides', 'Physical Activity Days Per Week',
'Sleep Hours Per Day', 'Heart Attack Risk', 'Systolic'],
dtype='object')
# Feature names saved for deployment. NOTE(review): the original list also
# contained the target column 'Heart Attack Risk'; it has been removed.
# The scaler above was fitted on the 13 predictor columns only
# (df.drop('Heart Attack Risk', axis=1)), so keeping the target here would
# misalign deployment inputs with the saved scaler.
features = ['Age', 'Cholesterol', 'Heart Rate', 'Exercise Hours Per Week', 'Diet',
            'Stress Level', 'Sedentary Hours Per Day', 'Income', 'BMI',
            'Triglycerides', 'Physical Activity Days Per Week',
            'Sleep Hours Per Day', 'Systolic']
'''
Save the feature list and the fitted scaler so they can be reused during deployment.
'''
# Persist the deployment artifacts. NOTE(review): despite the '.h5'
# extension these are joblib pickle files, not HDF5.
joblib.dump(features,'features.h5')
joblib.dump(scaler,'scaler.h5')
['scaler.h5']
df
| Age | Cholesterol | Heart Rate | Exercise Hours Per Week | Diet | Stress Level | Sedentary Hours Per Day | Income | BMI | Triglycerides | Physical Activity Days Per Week | Sleep Hours Per Day | Heart Attack Risk | Systolic | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 67 | 208 | 72 | 4.168189 | 1 | 9 | 6.615001 | 261404 | 31.251233 | 286 | 0 | 6 | 0 | 158 |
| 1 | 21 | 389 | 98 | 1.813242 | 0 | 1 | 4.963459 | 285768 | 27.194973 | 235 | 1 | 7 | 0 | 165 |
| 2 | 21 | 324 | 72 | 2.078353 | 2 | 9 | 9.463426 | 235282 | 28.176571 | 587 | 4 | 4 | 0 | 174 |
| 3 | 84 | 383 | 73 | 9.828130 | 1 | 9 | 7.648981 | 125640 | 36.464704 | 378 | 3 | 4 | 0 | 163 |
| 4 | 66 | 318 | 93 | 10.070897 | 0 | 6 | 1.514821 | 160555 | 21.809144 | 231 | 1 | 5 | 0 | 91 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 8758 | 60 | 121 | 61 | 7.917342 | 2 | 8 | 10.806373 | 235420 | 19.655895 | 67 | 7 | 7 | 0 | 94 |
| 8759 | 28 | 120 | 73 | 16.558426 | 2 | 8 | 3.833038 | 217881 | 23.993866 | 617 | 4 | 9 | 0 | 157 |
| 8760 | 47 | 250 | 105 | 3.148438 | 1 | 5 | 2.375214 | 36998 | 35.406146 | 527 | 4 | 4 | 1 | 161 |
| 8761 | 36 | 178 | 60 | 3.789950 | 0 | 5 | 0.029104 | 209943 | 27.294020 | 114 | 2 | 8 | 0 | 119 |
| 8762 | 25 | 356 | 75 | 18.081748 | 2 | 8 | 9.005234 | 247338 | 32.914151 | 180 | 7 | 4 | 1 | 138 |
8763 rows × 14 columns
'''
- The result of feature selection, using both the "high correlation filter" and the "Random Forest model", is as follows:
1) High correlation between Heart Attack Risk and 'Sedentary Hours Per Day', 'BMI', 'Exercise Hours Per Week', 'Income',
'Triglycerides', 'Cholesterol', 'Age', 'Heart Rate', 'Systolic', 'Stress Level', 'Physical Activity Days Per Week',
'Sleep Hours Per Day', 'Diet'.
- GridSearchCV hyperparameter tuning result ==> the best algorithm is the Random Forest model.
- The best classification model is the Random Forest algorithm, as it has the highest training and testing accuracy.
'''
'\n\n- The result of Feature selection using both "high correlation filter" or "Random Forest Model" as per below:\n1) High correlation between Heart Attack Risk and \'Sedentary Hours Per Day\' , \'BMI\' , \'Exercise Hours Per Week\' , \'Income\' \n, \'Triglycerides\' , \'Cholesterol\' , \'Age\' , \'Heart Rate\' , \'Systolic\' , \'Stress Level\' , \'Physical Activity Days Per Week\',\n\'Sleep Hours Per Day\' , \'Diet\' .\n\n\n- GridsearchCV- HyperParmeter Tunning Result ==> The best alogorithm is Random Forest Model.\n\n- The best classification model is Random Forest Algorithm as it has more accuracy_training and accuracy_testing.\n\n\n'